#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-InitialExperiments/SFARI_genes/R_Markdowns/')
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis)
library(biomaRt)
library(knitr)
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
Load raw count expression matrix and pubmed counts by gene
datExpr = read.csv('./../../FirstYearReview/Data/Gandal/RNAseq_ASD_datExpr.csv')
pubmed_counts = read.csv('./../Data/pubmed_count_by_gene.csv')
Convert ensembl IDs to gene names
# Get gene names from Ensembl IDs
getinfo = c('ensembl_gene_id','external_gene_id','gene_biotype')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org')
datGenes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=datExpr$X, mart=mart)
## Cache found
datGenes = datGenes[match(datExpr$X, datGenes$ensembl_gene_id),]
rm(getinfo, mart)
datGenes = datGenes %>% mutate('meanExpr' = rowMeans(datExpr[,-1])) %>%
left_join(pubmed_counts, by=c('external_gene_id'='Gene')) %>% na.omit
Very heavy right tail
ggplotly(datGenes %>% ggplot(aes(meanExpr+1)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
scale_x_log10() + theme_minimal())
Genes with the highest number of publications have names like MICE, LARGE, IMPACT, ACHE, SET, PIGS, NHS … so I think most of thir related publications are not about those actual genes…
summary(datGenes$Count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 723 10 7711933
cat(paste0(sum(datGenes$Count==0), ' genes have 0 publications (~', round(100*mean(datGenes$Count==0)), '%) (that\'s a lot!)'))
## 38694 genes have 0 publications (~61%) (that's a lot!)
datGenes %>% arrange(desc(Count)) %>% distinct(external_gene_id, .keep_all=TRUE) %>%
head(20) %>% kable(caption='Top 20 genes by number of publications')
| ensembl_gene_id | external_gene_id | gene_biotype | meanExpr | Count |
|---|---|---|---|---|
| ENSG00000180176 | TH | protein_coding | 5.4090909 | 7711933 |
| ENSG00000177606 | JUN | protein_coding | 1970.3750000 | 2604914 |
| ENSG00000136999 | NOV | protein_coding | 348.3522727 | 2332690 |
| ENSG00000228491 | MICE | pseudogene | 0.0000000 | 1634646 |
| ENSG00000173599 | PC | protein_coding | 803.6136364 | 1464096 |
| ENSG00000133424 | LARGE | protein_coding | 1544.2727273 | 1393404 |
| ENSG00000164458 | T | protein_coding | 0.3522727 | 1215682 |
| ENSG00000154059 | IMPACT | protein_coding | 292.6931818 | 910176 |
| ENSG00000087085 | ACHE | protein_coding | 234.3409091 | 818059 |
| ENSG00000119335 | SET | protein_coding | 1844.2500000 | 502402 |
| ENSG00000062485 | CS | protein_coding | 973.9204545 | 407959 |
| ENSG00000087111 | PIGS | protein_coding | 375.0795455 | 374551 |
| ENSG00000115718 | PROC | protein_coding | 3.9431818 | 359016 |
| ENSG00000168453 | HR | protein_coding | 701.6818182 | 256256 |
| ENSG00000175084 | DES | protein_coding | 31.4772727 | 217417 |
| ENSG00000105976 | MET | protein_coding | 780.5568182 | 186947 |
| ENSG00000188158 | NHS | protein_coding | 572.5681818 | 169222 |
| ENSG00000204490 | TNF | protein_coding | 0.0000000 | 167834 |
| ENSG00000169083 | AR | protein_coding | 163.5568182 | 166496 |
| ENSG00000010610 | CD4 | protein_coding | 204.3068182 | 164440 |
I’m going to remove all top genes until NHS (it looks like the TNF publications are actually about that gene) and then some other genes that have high counts and their names are known words (COPE, REST, MAX, ENG, SHE, CAT, COMP, CAMP, BAD, COIL, POLE, ACE, GRASP, CAST, …)
problematic_gene_names = c('COPE','REST','MAX','ENG','SHE','CAT','COMP','CAMP','BAD','COIL','POLE','ACE',
'GRASP','CAST','SON','CLOCK','ITCH','STAR','WARS','MARS','MARCO','ATM','CARS')
datGenes = datGenes %>% filter(Count<=167834 & !external_gene_id %in% problematic_gene_names)
ggplotly(datGenes %>% ggplot(aes(Count+1)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
scale_x_log10() + theme_minimal())
There is a positive relation between these two variables and the correlation is quite high but only when transforming to logarithmic scale the variables
The highest expressed gene is TNF with 167K articles
R^2 is the square of the correlation coefficient, so with one calculation we can compute both
corr = cor(log10(datGenes$meanExpr+1), log10(datGenes$Count+1))
datGenes %>% ggplot(aes(meanExpr+1, Count+1)) + geom_point(color='#0099cc', alpha=0.03) +
geom_smooth(color='gray', alpha=0.3) + scale_x_log10() + scale_y_log10() +
ggtitle(paste0('Cor = (' ,round(corr,4),') R^2 = (', round(corr^2,4),')')) + theme_minimal() + coord_fixed()
Differentiating protein coding genes from the rest, the correlation decreases when separating the genes, it seems like part of the reason why the correlation was so high because is because it was combining two behaviours, non-protein-coding genes with low levels of expression and low numbers of papers, and protein-coding genes, with higher levels of expression and of publications.
The red stripes on the plot correspond to non coding genes that have many ensembl IDs, like Y_RNA, snoU13, U3, or SNORD112
The non coding gene with the largest number of publications is the pseudogene MICE
cat(paste0(round(100*(sum(datGenes$gene_biotype=='protein_coding' & datGenes$Count==0)/sum(datGenes$gene_biotype=='protein_coding'))),
'% of the protein coding genes have zero publications'))
## 13% of the protein coding genes have zero publications
corr1 = cor(log10(datGenes$meanExpr+1)[datGenes$gene_biotype=='protein_coding'],
log10(datGenes$Count+1)[datGenes$gene_biotype=='protein_coding'])
corr2 = cor(log10(datGenes$meanExpr+1)[datGenes$gene_biotype!='protein_coding'],
log10(datGenes$Count+1)[datGenes$gene_biotype!='protein_coding'])
datGenes %>% mutate(protein_coding = gene_biotype=='protein_coding') %>%
ggplot(aes(meanExpr+1, Count+1, color=protein_coding)) +
geom_point(alpha=0.03) + geom_smooth() +
scale_x_log10() + scale_y_log10() +
ggtitle(paste0('Cor PC=', round(corr1,4), ', ', 'Cor nPC=', round(corr2,4))) +
theme_minimal() + coord_fixed()
rm(corr1, corr2)
datGenesPC = datGenes %>% filter(gene_biotype=='protein_coding') %>% mutate(meanExpr_l10 = log10(meanExpr+1),
Count_l10 = log10(Count+1))
corr = cor(datGenesPC$meanExpr_l10, datGenesPC$Count_l10)
datGenesPC %>% ggplot(aes(meanExpr+1, Count+1)) + geom_point(alpha=0.03, color='#0099cc') + geom_smooth(color='gray') +
scale_x_log10() + scale_y_log10() +
ggtitle(paste0('Correlation = ',round(corr,4),', R^2 = ', round(corr^2,4))) +
theme_minimal() + coord_fixed()
Filtering genes with the lowest levels of expression
datGenesPC %>% ggplot(aes(meanExpr+1)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
geom_vline(xintercept=10, color='gray') + scale_x_log10() + theme_minimal()
datGenesPC = datGenesPC %>% filter(meanExpr>=9)
corr = cor(datGenesPC$meanExpr_l10, datGenesPC$Count_l10)
datGenesPC %>% ggplot(aes(meanExpr+1, Count+1)) + geom_point(alpha=0.07, color='#0099cc') + geom_smooth(color='gray') +
scale_x_log10() + scale_y_log10() +
ggtitle(paste0('Correlation = ',round(corr,4),', R^2 = ', round(corr^2,4))) +
theme_minimal() + coord_fixed()